Netflix EDA¶


InĀ [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from datetime import datetime,timedelta
from itertools import combinations, islice
from collections import Counter
import operator
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

Import the dataset downloaded from Kaggle

InĀ [2]:
# Load the raw dataset; the CSV path is relative to the notebook's working directory
df = pd.read_csv('netflix-rotten-tomatoes-metacritic-imdb.csv')

Dataset exploration

InĀ [3]:
# Preview the first rows to see the available columns and typical values
df.head()
Out[3]:
Title Genre Tags Languages Series or Movie Hidden Gem Score Country Availability Runtime Director Writer Actors View Rating IMDb Score Rotten Tomatoes Score Metacritic Score Awards Received Awards Nominated For Boxoffice Release Date Netflix Release Date Production House Netflix Link IMDb Link Summary IMDb Votes Image Poster TMDb Trailer Trailer Site
0 Lets Fight Ghost Crime, Drama, Fantasy, Horror, Romance Comedy Programmes,Romantic TV Comedies,Horror ... Swedish, Spanish Series 4.3 Thailand < 30 minutes Tomas Alfredson John Ajvide Lindqvist KƄre Hedebrant, Per Ragnar, Lina Leandersson, ... R 7.9 98.0 82.0 74.0 57.0 $2,122,065 12 Dec 2008 2021-03-04 Canal+, Sandrew Metronome https://www.netflix.com/watch/81415947 https://www.imdb.com/title/tt1139797 A med student with a supernatural gift tries t... 205926.0 https://occ-0-4708-64.1.nflxso.net/dnm/api/v6/... https://m.media-amazon.com/images/M/MV5BOWM4NT... NaN NaN
1 HOW TO BUILD A GIRL Comedy Dramas,Comedies,Films Based on Books,British English Movie 7.0 Canada 1-2 hour Coky Giedroyc Caitlin Moran Paddy Considine, Cleo, Beanie Feldstein, Dónal... R 5.8 79.0 69.0 1.0 NaN $70,632 08 May 2020 2021-03-04 Film 4, Monumental Pictures, Lionsgate https://www.netflix.com/watch/81041267 https://www.imdb.com/title/tt4193072 When nerdy Johanna moves to London, things get... 2838.0 https://occ-0-1081-999.1.nflxso.net/dnm/api/v6... https://m.media-amazon.com/images/M/MV5BZGUyN2... https://www.youtube.com/watch?v=eIbcxPy4okQ YouTube
2 Centigrade Drama, Thriller Thrillers English Movie 6.4 Canada 1-2 hour Brendan Walsh Brendan Walsh, Daley Nixon Genesis Rodriguez, Vincent Piazza Unrated 4.3 NaN 46.0 NaN NaN $16,263 28 Aug 2020 2021-03-04 NaN https://www.netflix.com/watch/81305978 https://www.imdb.com/title/tt8945942 Trapped in a frozen car during a blizzard, a p... 1720.0 https://occ-0-1081-999.1.nflxso.net/dnm/api/v6... https://m.media-amazon.com/images/M/MV5BODM2MD... https://www.youtube.com/watch?v=0RvV7TNUlkQ YouTube
3 ANNE+ Drama TV Dramas,Romantic TV Dramas,Dutch TV Shows Turkish Series 7.7 Belgium,Netherlands < 30 minutes NaN NaN Vahide PerƧin, Gonca Vuslateri, Cansu Dere, Be... NaN 6.5 NaN NaN 1.0 NaN NaN 01 Oct 2016 2021-03-04 NaN https://www.netflix.com/watch/81336456 https://www.imdb.com/title/tt6132758 Upon moving into a new place, a 20-something r... 1147.0 https://occ-0-1489-1490.1.nflxso.net/dnm/api/v... https://m.media-amazon.com/images/M/MV5BNWRkMz... NaN NaN
4 Moxie Animation, Short, Drama Social Issue Dramas,Teen Movies,Dramas,Comedie... English Movie 8.1 Lithuania,Poland,France,Iceland,Italy,Spain,Gr... 1-2 hour Stephen Irwin NaN Ragga Gudrun NaN 6.3 NaN NaN NaN 4.0 NaN 22 Sep 2011 2021-03-04 NaN https://www.netflix.com/watch/81078393 https://www.imdb.com/title/tt2023611 Inspired by her moms rebellious past and a con... 63.0 https://occ-0-4039-1500.1.nflxso.net/dnm/api/v... https://m.media-amazon.com/images/M/MV5BODYyNW... NaN NaN
InĀ [4]:
# (rows, columns) of the raw frame
df.shape
Out[4]:
(15480, 29)
InĀ [5]:
# Column dtypes and non-null counts, to spot sparsely populated columns
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15480 entries, 0 to 15479
Data columns (total 29 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Title                  15480 non-null  object 
 1   Genre                  13770 non-null  object 
 2   Tags                   15413 non-null  object 
 3   Languages              13545 non-null  object 
 4   Series or Movie        15480 non-null  object 
 5   Hidden Gem Score       13379 non-null  float64
 6   Country Availability   15461 non-null  object 
 7   Runtime                15479 non-null  object 
 8   Director               10772 non-null  object 
 9   Writer                 11150 non-null  object 
 10  Actors                 13555 non-null  object 
 11  View Rating            8456 non-null   object 
 12  IMDb Score             13381 non-null  float64
 13  Rotten Tomatoes Score  6382 non-null   float64
 14  Metacritic Score       4336 non-null   float64
 15  Awards Received        6075 non-null   float64
 16  Awards Nominated For   7661 non-null   float64
 17  Boxoffice              4007 non-null   object 
 18  Release Date           13373 non-null  object 
 19  Netflix Release Date   15480 non-null  object 
 20  Production House       5149 non-null   object 
 21  Netflix Link           15480 non-null  object 
 22  IMDb Link              13177 non-null  object 
 23  Summary                15471 non-null  object 
 24  IMDb Votes             13379 non-null  float64
 25  Image                  15480 non-null  object 
 26  Poster                 11842 non-null  object 
 27  TMDb Trailer           7194 non-null   object 
 28  Trailer Site           7194 non-null   object 
dtypes: float64(7), object(22)
memory usage: 3.4+ MB
InĀ [6]:
# Summary statistics of the numeric columns, transposed so each column is a row
df.describe().T
Out[6]:
count mean std min 25% 50% 75% max
Hidden Gem Score 13379.0 5.937551 2.250202 0.6 3.8 6.8 7.9 9.8
IMDb Score 13381.0 6.496054 1.146910 1.0 5.8 6.6 7.3 9.7
Rotten Tomatoes Score 6382.0 59.523034 26.999173 0.0 38.0 64.0 83.0 100.0
Metacritic Score 4336.0 56.813653 17.582545 5.0 44.0 57.0 70.0 100.0
Awards Received 6075.0 8.764444 18.311171 1.0 1.0 3.0 8.0 300.0
Awards Nominated For 7661.0 13.983161 29.821052 1.0 2.0 5.0 12.0 386.0
IMDb Votes 13379.0 42728.411615 125701.191329 5.0 403.5 2322.0 20890.5 2354197.0

Ratings evaluation between different platforms

InĀ [7]:
# Gather the three platform scores under short names; IMDb is multiplied by 10
# so that all three columns share the same 0-100 range
score_columns = {'IMDb Score': 'imdb_scores',
                 'Rotten Tomatoes Score': 'rt_scores',
                 'Metacritic Score': 'mc_scores'}
scores_df = df[list(score_columns)].rename(columns=score_columns)
scores_df['imdb_scores'] = scores_df['imdb_scores'] * 10
InĀ [8]:
# Score densities of the different platforms, drawn on one shared axis
fig, ax = plt.subplots(figsize=(12, 5), dpi=150)
for column, label in [('imdb_scores', 'IMDb Score'),
                      ('rt_scores', 'Rotten Tomatoes Score'),
                      ('mc_scores', 'Metacritic Score')]:
    sns.kdeplot(data=scores_df, x=column, label=label, fill=True, ax=ax)
ax.legend(loc='upper left')
ax.set_xlim(0, 100)
ax.set_title('IMDb vs Rotten Tomatoes vs Metacritic score densities')
ax.set_xlabel('Score %')
plt.show()
No description has been provided for this image

Columns selection

InĀ [9]:
# Columns kept for the rest of the analysis; .copy() gives an independent frame
# so later column assignments do not touch the raw `df`
selected_columns = [
    'Title', 'Genre', 'Languages', 'Series or Movie', 'Hidden Gem Score', 'Runtime',
    'Director', 'Actors', 'IMDb Score', 'IMDb Votes', 'Awards Nominated For', 'Awards Received',
    'Boxoffice', 'Netflix Release Date', 'Country Availability',
]
df_selected = df.loc[:, selected_columns].copy()

df_selected.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15480 entries, 0 to 15479
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Title                 15480 non-null  object 
 1   Genre                 13770 non-null  object 
 2   Languages             13545 non-null  object 
 3   Series or Movie       15480 non-null  object 
 4   Hidden Gem Score      13379 non-null  float64
 5   Runtime               15479 non-null  object 
 6   Director              10772 non-null  object 
 7   Actors                13555 non-null  object 
 8   IMDb Score            13381 non-null  float64
 9   IMDb Votes            13379 non-null  float64
 10  Awards Nominated For  7661 non-null   float64
 11  Awards Received       6075 non-null   float64
 12  Boxoffice             4007 non-null   object 
 13  Netflix Release Date  15480 non-null  object 
 14  Country Availability  15461 non-null  object 
dtypes: float64(5), object(10)
memory usage: 1.8+ MB
InĀ [10]:
# Row count is unchanged; only the column count dropped
df_selected.shape
Out[10]:
(15480, 15)

Dates manipulation

InĀ [11]:
# Parse 'Netflix Release Date' once, then store the parsed dates together with
# the year and month derived from them
netflix_release = pd.to_datetime(df_selected['Netflix Release Date'])
df_selected = df_selected.assign(**{'Netflix Release Date': netflix_release,
                                    'Year': netflix_release.dt.year,
                                    'Month': netflix_release.dt.month})
InĀ [12]:
# Check that the date column was parsed and that Year/Month were added
df_selected.dtypes
Out[12]:
Title                           object
Genre                           object
Languages                       object
Series or Movie                 object
Hidden Gem Score               float64
Runtime                         object
Director                        object
Actors                          object
IMDb Score                     float64
IMDb Votes                     float64
Awards Nominated For           float64
Awards Received                float64
Boxoffice                       object
Netflix Release Date    datetime64[ns]
Country Availability            object
Year                             int64
Month                            int64
dtype: object

Top Genres¶

Create a dataset to work with genres

InĀ [13]:
# Genre working set: an independent copy restricted to rows that have a genre,
# an IMDb score and an IMDb vote count
required_columns = ['Genre', 'IMDb Score', 'IMDb Votes']
df_gen = df_selected.dropna(subset=required_columns).copy()
InĀ [14]:
# One row per (title, genre): split the comma-separated genre string, explode the
# resulting lists into rows, then trim the space left after each comma.
# The split reads df_gen's own column rather than the raw `df`, so the result no
# longer relies on df_gen's index still lining up with the original frame.
df_gen['Genre'] = df_gen['Genre'].str.split(',')
df_gen = df_gen.explode('Genre').reset_index(drop=True)
df_gen['Genre'] = df_gen['Genre'].str.strip()
InĀ [15]:
# Keep only rows whose 'IMDb Score' is above the mean 'IMDb Score'.
# .copy() makes the filtered result an independent frame, so the column assignment
# below does not write to a slice of the previous df_gen.
# Note: this cell rebinds df_gen, so running it twice filters a second time.
df_gen = df_gen[df_gen['IMDb Score'] > df_gen['IMDb Score'].mean()].copy()

# 'mul_rating' = 0.2 * score + 0.8 * votes, expressed as a percentage of its largest value
# NOTE(review): votes are orders of magnitude larger than scores, so this is
# effectively each row's share of the maximum vote count - confirm that is intended
weighted_rating = 0.2 * df_gen['IMDb Score'] + 0.8 * df_gen['IMDb Votes']
df_gen['mul_rating'] = (weighted_rating / weighted_rating.max()) * 100
InĀ [16]:
# Summary statistics of the exploded, above-mean-score genre frame (including 'mul_rating')
df_gen.describe().T
Out[16]:
count mean std min 25% 50% 75% max
Hidden Gem Score 18689.0 5.884280 2.162107 1.700000 3.900000 5.80000 8.100000 9.8
IMDb Score 18689.0 7.381791 0.567072 6.600000 6.900000 7.30000 7.800000 9.7
IMDb Votes 18689.0 71812.519557 168118.728762 5.000000 991.000000 5753.00000 59010.000000 2354197.0
Awards Nominated For 12899.0 20.844872 38.008842 1.000000 3.000000 8.00000 21.000000 386.0
Awards Received 11022.0 12.107149 22.572374 1.000000 2.000000 5.00000 12.000000 300.0
Year 18689.0 2017.387394 1.976495 2015.000000 2015.000000 2017.00000 2019.000000 2021.0
Month 18689.0 6.496228 3.430340 1.000000 4.000000 6.00000 10.000000 12.0
mul_rating 18689.0 3.050479 7.141229 0.000282 0.042165 0.24445 2.506655 100.0
InĀ [17]:
# Per-genre summary: mean 'IMDb Score', summed 'IMDb Votes' and mean 'mul_rating',
# ordered from the highest to the lowest mean 'mul_rating'
genre_summary = df_gen.groupby('Genre').agg({'IMDb Score': 'mean',
                                             'IMDb Votes': 'sum',
                                             'mul_rating': 'mean'}).reset_index()
df_gen_plot = genre_summary.sort_values('mul_rating', ascending=False)

# Min-max rescale the per-genre 'mul_rating' to 0-10 on a copy; df_gen_plot keeps the raw values
rating_min = df_gen_plot['mul_rating'].min()
rating_span = df_gen_plot['mul_rating'].max() - rating_min
normalized_df = df_gen_plot.assign(
    mul_rating=(df_gen_plot['mul_rating'] - rating_min) / rating_span * 10)

Plot Genres vs Rating

InĀ [18]:
# Bar chart of the 0-10 normalized rating, one bar per genre
fig, ax = plt.subplots(figsize=(12, 5), dpi=150)
sns.barplot(data=normalized_df, x='Genre', y='mul_rating', palette='rocket', ax=ax)
ax.set(title='Genres Rating', ylabel='Rating', xlabel='Genre')
plt.xticks(rotation=45, ha='right')
plt.show()
No description has been provided for this image

Popular Genres per Year¶

InĀ [19]:
# Rows per (year, genre), ordered by year and then by count, both descending.
# NOTE(review): df_gen was earlier restricted to above-mean IMDb scores, so these
# counts exclude lower-rated titles - confirm that is intended
count_label = 'Count of Genres per Year'
genre_per_year = (
    df_gen.groupby(['Year', 'Genre'])['Year']
    .count()
    .reset_index(name=count_label)
    .sort_values(['Year', count_label], ascending=False)
)

# Restrict to 2016-2020 (both ends included)
genre_per_year = genre_per_year[genre_per_year['Year'].between(2016, 2020)]

# The rows are already sorted by count within each year, so the first five per year are the top five
top_genres_per_year = genre_per_year.groupby('Year').head(5).reset_index(drop=True)
InĀ [20]:
# Bar chart of the five most frequent genres in each year.
# dodge=False draws all genres of a year at the same x position instead of side by side.
fig, ax = plt.subplots(figsize=(12, 5))
palette = 'flare'
sns.barplot(data=top_genres_per_year, x='Year', y='Count of Genres per Year',
            hue='Genre', palette=palette, dodge=False, ax=ax)
ax.set_ylim(0, 900)
ax.set(title='Top 5 Genres per Year', xlabel='Year', ylabel='Number of Titles')
# Legend is anchored outside the axes, to the right of the plot
ax.legend(loc='center right', bbox_to_anchor=(1.3, 0.5))
plt.tight_layout()
plt.show()
No description has been provided for this image

Yearly series and movies production¶

InĀ [21]:
# Number of titles per (year, series/movie)
title_counts = (df_selected.groupby(['Year', 'Series or Movie'])['Title']
                .count()
                .reset_index())

# Keep 2016-2020 only (both ends included)
df_prod = title_counts[title_counts['Year'].between(2016, 2020)]
InĀ [22]:
# Side-by-side bars: movies vs series added in each year

ax = sns.barplot(data=df_prod, x="Year", y="Title", hue="Series or Movie", palette='rocket')
ax.set(title='Number of Netflix Titles Released per Year',
       xlabel='Year',
       ylabel='Number of Titles')
ax.legend(title='', loc='upper left')
plt.show()
No description has been provided for this image

Acting Duos¶

InĀ [23]:
# Select relevant columns and rows with valid actors
df_act = df_selected[['Title', 'Actors']].dropna()
# Split the comma-separated cast and trim the space after each comma, so that
# ' Per Ragnar' and 'Per Ragnar' are treated as the same actor
df_act['Actors'] = df_act['Actors'].astype(str).str.split(',').map(
    lambda names: [name.strip() for name in names])

# Count how often each unordered pair of actors shares a title.
# Names are de-duplicated and sorted per title first, so (A, B) and (B, A) become
# the same key regardless of billing order and nobody is paired with themselves.
comblist = [pair
            for actors in df_act['Actors']
            for pair in combinations(sorted(set(actors)), 2)]
c = Counter(comblist)

# Top 15 pairs, split into labels (b) and counts (lispa2) for the scatter plot.
# The previous unused np.array(pairs) was dropped: the ((name, name), count) tuples
# are ragged, and recent NumPy versions refuse to build an array from them.
pairs = c.most_common(15)
lispa1 = [list(pair) for pair, _ in pairs]
lispa2 = [count for _, count in pairs]
b = ['-'.join(pair) for pair in lispa1]
InĀ [24]:
# Scatter of how many titles each of the top actor pairs appears in together
fig, ax = plt.subplots(figsize=(12, 5), dpi=150)
ax.scatter(b, lispa2, alpha=0.8, s=100, edgecolors='black', c='firebrick')
plt.xticks(rotation=45, ha='right')
ax.set_title('Most Dynamic Acting Duos', fontsize=16)
ax.set_xlabel('Acting Duos', fontsize=14)
ax.set_ylabel('Number of Movies', fontsize=14)
plt.tight_layout()
plt.show()
No description has been provided for this image

Top Directors¶

InĀ [25]:
# One row per (title, director): keep titles that have a director, split the
# comma-separated credit and trim the space after each comma so the same name is
# not counted under two spellings.
# .copy() detaches the working frame from `df` before columns are assigned.
df_dir = df[df['Director'].notna()].copy()
df_dir['Director'] = df_dir['Director'].str.split(',')
df_dir = df_dir.explode('Director').reset_index(drop=True)
df_dir['Director'] = df_dir['Director'].str.strip()

# Count the number of titles each director is credited on
df_dir['Director_Count'] = df_dir['Director'].map(df_dir['Director'].value_counts())

# Aggregate per director. String aggregator names replace np.mean / np.sum /
# pd.Series.count, which pandas deprecates as groupby aggregation callables.
df_dir1 = df_dir.groupby('Director').agg({'Hidden Gem Score': 'mean',
                                          'IMDb Score': 'mean',
                                          'IMDb Votes': 'sum',
                                          'Director_Count': 'count',
                                          'Awards Received': 'sum'}).reset_index()

# Keep directors above the average in total votes, title count and awards received,
# then take the ten with the highest mean IMDb Score (ties broken by votes)
above_average = ((df_dir1['IMDb Votes'] > df_dir1['IMDb Votes'].mean())
                 & (df_dir1['Director_Count'] > df_dir1['Director_Count'].mean())
                 & (df_dir1['Awards Received'] > df_dir1['Awards Received'].mean()))
df_dir2 = (df_dir1[above_average]
           .sort_values(['IMDb Score', 'IMDb Votes'], ascending=False)
           [['Director', 'IMDb Score']]
           .head(10))

Scatterplot

InĀ [26]:
# Plot a scatterplot of IMDb Score vs. Director for the top 10 directors
plt.figure(figsize=(12, 5), dpi=150)
plt.scatter(x=df_dir2['Director'], y=df_dir2['IMDb Score'], alpha=0.8, s=100, edgecolors='black', c='red')
plt.xticks(rotation=45, ha='right', fontsize=12)
plt.yticks(fontsize=12)
# Title uses the default text colour like every other figure; the previous
# color='white' made it invisible on the default white background
plt.title('Top Directors according to IMDb Score', fontsize=16)
plt.xlabel('Director', fontsize=14)
plt.ylabel('IMDb Score', fontsize=14)
plt.tight_layout()
plt.show()
No description has been provided for this image

Barplot

InĀ [27]:
# Interactive bar chart of the same top directors, bars shaded by IMDb Score
fig = px.bar(df_dir2,
             x='Director',
             y='IMDb Score',
             color='IMDb Score',
             color_continuous_scale=px.colors.sequential.Reds)
layout_options = dict(title='Top Directors according to IMDb Score',
                      xaxis_title='Director',
                      yaxis_title='IMDb Score',
                      xaxis_tickangle=-45)
fig.update_layout(**layout_options)
fig.show()

Top Genre-Diverse Actors¶

InĀ [28]:
# Creating a dataframe to search for genre diverse actors:
# one row per (title, actor, genre).

# Keep titles that have both a cast and a genre; .copy() detaches the working
# frame from `df` before columns are assigned.
df_act = df.dropna(subset=['Actors', 'Genre']).copy()

# Split each comma-separated column, explode it into rows and trim the space left
# after each comma. Without the trim, the same actor appears under two different
# keys (with and without a leading space) and is counted as two people.
for column in ['Actors', 'Genre']:
    df_act[column] = df_act[column].str.split(',')
    df_act = df_act.explode(column).reset_index(drop=True)
    df_act[column] = df_act[column].str.strip()
df_act.head()
Out[28]:
Title Genre Tags Languages Series or Movie Hidden Gem Score Country Availability Runtime Director Writer Actors View Rating IMDb Score Rotten Tomatoes Score Metacritic Score Awards Received Awards Nominated For Boxoffice Release Date Netflix Release Date Production House Netflix Link IMDb Link Summary IMDb Votes Image Poster TMDb Trailer Trailer Site
0 Lets Fight Ghost Crime Comedy Programmes,Romantic TV Comedies,Horror ... Swedish, Spanish Series 4.3 Thailand < 30 minutes Tomas Alfredson John Ajvide Lindqvist KƄre Hedebrant R 7.9 98.0 82.0 74.0 57.0 $2,122,065 12 Dec 2008 2021-03-04 Canal+, Sandrew Metronome https://www.netflix.com/watch/81415947 https://www.imdb.com/title/tt1139797 A med student with a supernatural gift tries t... 205926.0 https://occ-0-4708-64.1.nflxso.net/dnm/api/v6/... https://m.media-amazon.com/images/M/MV5BOWM4NT... NaN NaN
1 Lets Fight Ghost Drama Comedy Programmes,Romantic TV Comedies,Horror ... Swedish, Spanish Series 4.3 Thailand < 30 minutes Tomas Alfredson John Ajvide Lindqvist KƄre Hedebrant R 7.9 98.0 82.0 74.0 57.0 $2,122,065 12 Dec 2008 2021-03-04 Canal+, Sandrew Metronome https://www.netflix.com/watch/81415947 https://www.imdb.com/title/tt1139797 A med student with a supernatural gift tries t... 205926.0 https://occ-0-4708-64.1.nflxso.net/dnm/api/v6/... https://m.media-amazon.com/images/M/MV5BOWM4NT... NaN NaN
2 Lets Fight Ghost Fantasy Comedy Programmes,Romantic TV Comedies,Horror ... Swedish, Spanish Series 4.3 Thailand < 30 minutes Tomas Alfredson John Ajvide Lindqvist KƄre Hedebrant R 7.9 98.0 82.0 74.0 57.0 $2,122,065 12 Dec 2008 2021-03-04 Canal+, Sandrew Metronome https://www.netflix.com/watch/81415947 https://www.imdb.com/title/tt1139797 A med student with a supernatural gift tries t... 205926.0 https://occ-0-4708-64.1.nflxso.net/dnm/api/v6/... https://m.media-amazon.com/images/M/MV5BOWM4NT... NaN NaN
3 Lets Fight Ghost Horror Comedy Programmes,Romantic TV Comedies,Horror ... Swedish, Spanish Series 4.3 Thailand < 30 minutes Tomas Alfredson John Ajvide Lindqvist KƄre Hedebrant R 7.9 98.0 82.0 74.0 57.0 $2,122,065 12 Dec 2008 2021-03-04 Canal+, Sandrew Metronome https://www.netflix.com/watch/81415947 https://www.imdb.com/title/tt1139797 A med student with a supernatural gift tries t... 205926.0 https://occ-0-4708-64.1.nflxso.net/dnm/api/v6/... https://m.media-amazon.com/images/M/MV5BOWM4NT... NaN NaN
4 Lets Fight Ghost Romance Comedy Programmes,Romantic TV Comedies,Horror ... Swedish, Spanish Series 4.3 Thailand < 30 minutes Tomas Alfredson John Ajvide Lindqvist KƄre Hedebrant R 7.9 98.0 82.0 74.0 57.0 $2,122,065 12 Dec 2008 2021-03-04 Canal+, Sandrew Metronome https://www.netflix.com/watch/81415947 https://www.imdb.com/title/tt1139797 A med student with a supernatural gift tries t... 205926.0 https://occ-0-4708-64.1.nflxso.net/dnm/api/v6/... https://m.media-amazon.com/images/M/MV5BOWM4NT... NaN NaN
InĀ [29]:
# Top genre-diverse actors: distinct genres and distinct titles per actor,
# then the ten actors with the most distinct genres
df_act2 = (df_act.groupby('Actors')[['Genre', 'Title']]
           .nunique()
           .reset_index())
df_act_div = df_act2.sort_values(by='Genre', ascending=False).iloc[:10]

Barplot

InĀ [30]:
# Number of distinct genres for each of the ten most genre-diverse actors
ax = sns.barplot(data=df_act_div, x="Actors", y="Genre", palette='flare')
ax.set_title('Top Genre-Diverse Actors')
plt.xticks(rotation=45, ha='right')
ax.set_xlabel('Actors')
ax.set_ylabel('Genres')
plt.show()
No description has been provided for this image

Top Genre-Specific Actors¶

InĀ [31]:
# Genre-specific actors: many distinct titles relative to few distinct genres.
# 'Difference' = distinct titles minus distinct genres; keep the ten largest.

df_act2['Difference'] = df_act2['Title'].sub(df_act2['Genre'])
df_act_spec = df_act2.sort_values(by='Difference', ascending=False).iloc[:10]
InĀ [32]:
# Overlaid horizontal bars per actor: distinct titles first, distinct genres on top

f, ax = plt.subplots(figsize=(6, 6))

# The colour code 'r' is re-mapped before each layer ("pastel" for titles,
# "muted" for genres), so the two layers get different shades of red
for palette_name, column in [("pastel", "Title"), ("muted", "Genre")]:
    sns.set_color_codes(palette_name)
    sns.barplot(x=column, y="Actors", data=df_act_spec,
                label=column, color='r', ax=ax)

ax.legend(ncol=2, loc="lower right", frameon=True)
ax.set(xlim=(0, 24), ylabel="",
       xlabel="Top Genre-Specific Actors")
sns.despine(left=True, bottom=True)
No description has been provided for this image

Box Office¶

InĀ [33]:
# Box-office working set: rows of the genre frame that have a 'Boxoffice' value.
# NOTE(review): df_gen holds one row per (title, genre) and only above-mean IMDb
# scores, so this subset inherits both properties - confirm that is intended

df_boxoffice = df_gen.dropna(subset=['Boxoffice'])
InĀ [34]:
# Remove '$' signs and commas from the 'Boxoffice' strings, then convert them to numbers
boxoffice_digits = df_boxoffice['Boxoffice'].replace(r'[\$,]', '', regex=True)
df_boxoffice['Boxoffice'] = pd.to_numeric(boxoffice_digits)
InĀ [35]:
# Creating two dataframes for Series and Movies Box Office

def _top_boxoffice_genres(frame, kind, n=10):
    """Return the `n` genres with the largest summed 'Boxoffice' for one title type.

    frame: DataFrame with 'Series or Movie', 'Genre' and numeric 'Boxoffice' columns.
    kind:  value of 'Series or Movie' to keep (e.g. 'Series' or 'Movie').
    n:     number of genres to return.

    Returns a DataFrame with columns 'Genre' and 'Boxoffice', largest total first.
    """
    # The string aggregator 'sum' replaces np.sum, which pandas deprecates as a
    # groupby aggregation callable
    return (frame[frame['Series or Movie'] == kind]
            .groupby(['Genre'])
            .agg({'Boxoffice': 'sum'})
            .sort_values('Boxoffice', ascending=False)
            .reset_index()
            .head(n))


df_boxoffice_series = _top_boxoffice_genres(df_boxoffice, 'Series')
df_boxoffice_movies = _top_boxoffice_genres(df_boxoffice, 'Movie')
InĀ [36]:
# Plot a Barplot for Series Box Office by Genre

# Convert dollars to millions explicitly so the axis label states the plotted unit
# instead of depending on whatever offset matplotlib picks for the raw values
series_millions = df_boxoffice_series.assign(Boxoffice=df_boxoffice_series['Boxoffice'] / 1e6)
sns.barplot(data=series_millions, x="Genre", y="Boxoffice", palette='flare')
plt.xticks(rotation = 45, ha='right')
plt.title('Series Box Office by Genre')
plt.ylabel('Box Office (million $)')
plt.show()
No description has been provided for this image
InĀ [37]:
# Plot a Barplot for Movies Box Office by Genre

# Convert dollars to millions explicitly so the axis label states the plotted unit
# instead of depending on whatever offset matplotlib picks for the raw values
movies_millions = df_boxoffice_movies.assign(Boxoffice=df_boxoffice_movies['Boxoffice'] / 1e6)
sns.barplot(data=movies_millions, x="Genre", y="Boxoffice", palette='flare')
plt.xticks(rotation = 45, ha='right')
plt.title('Movies Box Office by Genre')
plt.ylabel('Box Office (million $)')
# Save before plt.show(); transparent background, tight bounding box
plt.savefig('figure1.png', transparent=True,bbox_inches='tight') # save as png

plt.show()
No description has been provided for this image